In this notebook I will import gene counts file that were generated by Bowtie –> FeatureCounts

Check working directory

getwd()
## [1] "/Volumes/Bumblebee/O.lurida_QuantSeq-2020/notebooks"

Load libraries and source scripts

source("../references/biostats.R")

list.of.packages <- c("DESeq2", "RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "corrplot", "PerformanceAnalytics") #add new libraries here 
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)

# Load all libraries 
lapply(list.of.packages, FUN = function(X) {
  do.call("require", list(X)) 
})

Import QuantSeq sample info, library/file names, and then join

sample.info <- read.csv("../raw-data/quantseq2020_key.csv", header=T, na.strings = NA) %>%
          mutate_at(vars(lane, temp.parent), as.factor) %>% 
          mutate_at(vars(RNA.conc, endpoint.RFU, bioan.mean.bp), as.numeric) %>%
          mutate(sample_stage=paste(sample, stage, sep="_"))

filenames <- data.frame(read.table("../qc-processing/featureCounts/filenames.txt", header = F, stringsAsFactors = F, fill = FALSE)) %>%
          dplyr::rename(filename = 1) %>%
          mutate(sample = as.character(gsub("_.*","", filename))) %>% 
          left_join(sample.info[c("sample", "sample_stage")])
## Joining, by = "sample"
## Warning: Column `sample` joining character vector and factor, coercing into
## character vector
# Fill in NA value on the "Undetermined" row  
filenames[filenames$filename == "Undetermined","sample_stage"] <- "Undetermined"

key <- full_join(sample.info, filenames)
## Joining, by = c("sample", "sample_stage")
## Warning: Column `sample` joining factor and character vector, coercing into
## character vector
save(key, file="../raw-data/key")

Import counts matrix file as dataframe.

counts <- data.frame(read.table("../qc-processing/featureCounts/featurecounts-gene2kbdown.Rmatrix.txt", header = T, stringsAsFactors = F, fill = FALSE)) %>%
          column_to_rownames(var="Geneid") %>%
          rename_all(~as.character(filenames$sample_stage))
save(counts, file = "../results/gene-counts") #save R object with all gene counts, before filtering, removing samples, etc. 

Summarize counts and visualize (remove last column - that’s undetermined counts)

print(paste("Number of samples:", ncol(counts[,-ncol(counts)]), sep=" "))
## [1] "Number of samples: 146"
print(paste("Total number of genes in dataframe:", prettyNum(nrow(counts[,-ncol(counts)]), big.mark = ","), sep=" "))
## [1] "Total number of genes in dataframe: 32,210"
print(paste("Total counts, all samples:", prettyNum(sum(colSums(counts[,-ncol(counts)])), big.mark = ","), sep=" "))
## [1] "Total counts, all samples: 411,542,439"
#print(paste("Counts for", colnames(counts), ":",  prettyNum(colSums(counts), big.mark = ","), sep=" "))

#inspect total counts by sample
#ggplotly(
  ggplot(data.frame(colSums(counts[,-ncol(counts)])) %>% dplyr::rename(count.total = 1) %>% rownames_to_column(var="sample"))  + geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") + 
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) #) 

Filter sample sequencing stats

Inspect total # reads, set minimum for retaining samples

multiqc.stats <- read.delim(file="../qc-processing/fastqc/trimmed/multiqc_report_trimmed_data/multiqc_general_stats.txt", sep = "\t") 
colnames(multiqc.stats) <- gsub("FastQC_mqc.generalstats.fastqc.", "", colnames(multiqc.stats)) 
multiqc.stats <- multiqc.stats %>% 
  mutate(total_unique = total_sequences*(100-percent_duplicates)/100) %>%
  mutate(Sample = gsub(".trim", "", Sample)) %>% 
  left_join(key, by=c("Sample"="sample"))

Investigate possible correlations among # reads and prep stuff

No. of PCR cycles strongly correlates positively % duplicates & % fails and negatively with total unique reads, less strongly with GC content.

cor(multiqc.stats %>%drop_na(population) %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)

## Look closer at pcr cycles and read stummary stats

chart.Correlation(multiqc.stats %>%drop_na(population) %>%
                    select(pcr.cycles, percent_duplicates, 
                           percent_gc, percent_fails, total_unique), 
                  histogram=F, pch=19)

It looks like the % dupliates increases with PCR cycles, so I presume that there are artifacts of PCR overamplification. Since i have SE reads w/o molecular identifiers, 100bp dont remove duplicates, but proceed with caution - 10.7717/peerj.3091

Plot percent_duplicates ~ pcr.cycles, indicating sample #’s

ggplotly(
  ggplot(multiqc.stats %>%drop_na(population), 
                aes(x=pcr.cycles, y=percent_duplicates, 
                    label=Sample, color=stage, shape=pCO2.parent)) + 
           geom_point(size=3))
#scale_color_manual(values=c("black", "red")) +

I think I should remove all samples with PCR cycles > 17 - what would those be?

# How many of each tissue/population/treatment would I throw out? 
multiqc.stats %>% filter(pcr.cycles>17) %>% group_by(stage, population, pCO2.parent) %>% count() %>% View()
## Warning: Factor `pCO2.parent` contains implicit NA, consider using
## `forcats::fct_explicit_na`

## Warning: Factor `pCO2.parent` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/Resources/
## modules/R_de.so'' had status 1
# How many would I retain? 
multiqc.stats %>% filter(pcr.cycles<=17) %>% group_by(stage, population, pCO2.parent) %>% count() %>% View()
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## running command ''/usr/bin/otool' -L '/Library/Frameworks/R.framework/Resources/
## modules/R_de.so'' had status 1
# this looks fine only big discrepancy is in Fidalgo Bay larvae 

Check out pcr.cycles correation after removing all samples with >17 pcr cycles

cor(multiqc.stats %>%drop_na(population) %>% filter(pcr.cycles<=17) %>% dplyr::select_if(is.numeric)) %>% corrplot(tl.cex=.45)

chart.Correlation(multiqc.stats %>%drop_na(population) %>% filter(pcr.cycles<=17) %>%
                    select(pcr.cycles, percent_duplicates, 
                           percent_gc, percent_fails, total_unique), 
                  histogram=F, pch=19)

# much better. 

FILTER WHOLE SAMPLES: RemovE whole samples from the data set here - those with very low counts, with high PCR cycles and duplicates

Samples 35, 412, and 44 have very low counts, similar to sample 571 which was the no-tissue control, prepared alongside larval samples.

remove.list <- c("Undetermined", multiqc.stats %>% filter(pcr.cycles>17) %>% select(sample_stage) %>% unlist() %>% as.vector()) 
counts.filtered <- counts[ , -which(names(counts) %in% remove.list)]
ncol(counts.filtered)
## [1] 129
key <- key[ -which(key$sample_stage %in% remove.list), ]
nrow(key)
## [1] 129
nrow(key) == ncol(counts.filtered) #should = TRUE.
## [1] TRUE
ggplotly(ggplot(data = data.frame(colSums(counts.filtered)) %>%
                  dplyr::rename(count.total = 1) %>%
                  rownames_to_column(var="sample")) +
           geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total count by sample") +
             theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()))

## OPTIONAL: Remove genes that were in the No-Tissue Control, look at counts again

# remove.genes <- counts %>% select("571_larvae") %>% rownames_to_column("gene") %>% filter(`571_larvae` > 100) %>% select(gene) %>% unlist() %>% as.vector()
# counts.filtered <- counts #[-which(rownames(counts) %in% remove.genes),]

Transpose dataframe so each row = a sample (aka “objects”), and each column = genes (aka “variables”)

counts.t <- t(counts.filtered) #transform data to have each sample a row, each column a gene 

Save counts file, and transformed counts file

save(counts.filtered, file = "../results/gene-counts-filtered")
save(counts.t, file = "../results/gene-counts-trans")

Import gene feature file to get gene ID info for GO enrichment analysis, etc.

# Read in O. lurida gene file that connects OLUR gene ID to uniprot accession number 
Olurida_gene_uniprot <- read_delim(file = here::here("references", "Olurida_v081-20190709-UniprotID.gff"), skip = 1, delim = "\t", col_names = c("contig", "source", "feature", "start", "end", "unknown1", "strand", "unknown2", "notes")) %>%
# Split giant gene "Notes" column into separate columns 
  mutate(ID=str_extract(notes, "ID=(.*?);"),
       Parent=str_extract(notes, "Parent=(.*?);"),
       Name=str_extract(notes, "Name=(.*?);"),
       Alias=str_extract(notes, "Alias=(.*?);"),
       AED=str_extract(notes, "AED=(.*?);"),
       eAED=str_extract(notes, "eAED=(.*?);"),
       Note=str_extract(notes, "Note=(.*?);"),
       Ontology_term=str_extract(notes, "Ontology_term=(.*?);"),
       Dbxref=str_extract(notes, "Dbxref=(.*?);"),
       SPID=str_extract(notes, "SPID=(.*?);")
       ) %>% 
  
  #remove extraneous info from Olur gene ID and Uniprot species ID ("SPID")
  mutate(Name=str_remove(Name, "Name=")) %>% mutate(Name=str_remove(Name, ";")) %>%
  mutate(SPID=str_remove(SPID, "SPID=")) %>% mutate(SPID=str_remove(SPID, ";"))
## Warning in readLines(f, n): line 1 appears to contain an embedded nul
## Warning in readLines(f, n): incomplete final line found on '/Volumes/Bumblebee/
## O.lurida_QuantSeq-2020/._QuantSeq-04-21-2020.Rproj'
## Parsed with column specification:
## cols(
##   contig = col_character(),
##   source = col_character(),
##   feature = col_character(),
##   start = col_double(),
##   end = col_double(),
##   unknown1 = col_character(),
##   strand = col_character(),
##   unknown2 = col_character(),
##   notes = col_character()
## )
save(Olurida_gene_uniprot, file = "../references/Olurida_gene_uniprot")